#!/usr/local/bin/perl
# This package uses the global file handle htmlFile
# There are two global assoc. arrays, endTags & handlerDict
# parseHtml takes one argument, a filename
# and returns the parsed html in a string
sub parseHtml
{
    # parseHtml - open an HTML file and run it through mainHtmlParser.
    #
    # Arguments:
    #   fileName - name of the file to parse; it is opened on the global
    #              file handle htmlFile, which mainHtmlParser reads from.
    #
    # Returns the filtered text produced by mainHtmlParser, or undef when
    # the file could not be opened (in which case $! holds the reason).
    #
    # NOTE(review): this is a two-argument open, so mode characters in
    # $fileName (a leading "<" or ">", a trailing "|") are interpreted by
    # open. Callers must not pass untrusted names.
    local($fileName) = @_;
    # Stays undefined unless the file is opened and parsed.
    local($retVal);
    # Test the result of open itself. A bareword such as htmlFile used as
    # a condition is just a true string and says nothing about whether
    # the file was really opened.
    if(open(htmlFile,$fileName))
    {
        # Parse the whole file: no stop string and no stop character.
        $retVal = &mainHtmlParser("",0);
        # Only close what was actually opened.
        close(htmlFile);
    }
    # Return the string parsed from the file.
    return $retVal;
}
# mainHtmlParser takes several arguments
# This subroutine can either take a stop string, or a stop char
# it reads the file htmlFile until either the end of file
# the stopstring or the stop char is encountered.
#
# mainHtmlParser returns a string filtered from the file.
# The filters are tag handlers and a default handler.
# Handlers should take 5 arguments for:
#
# tagString - The string containing the tag
# argString - Any data between the tag and end tag
# endString - The end tag
# tagDict - The dictionary created using dictForTag
# userData - The user data argument (the handler calls in this
# file do not currently supply a value for it)
#
# Handlers are registered in the global dictionary
# handlerDict.
#
# If the tag has a matching end tag, like <B> and </B>,
# then the tag should be registered in the global
# %endTags array, with the value equal to its end tag.
#
# If the tag needs the data up to the end of the line, like
# OPTION, then it should appear in %endTags with the value
# "eol".
#
# Handlers should return the string to replace the tag with.
#
# The default is used for text that wasn't part of a tag.
# Tags are denoted by angle brackets, e.g. <TAG>.
# As plain text is encountered the handler registered under
# the string "DEFAULT" is called.
sub mainHtmlParser
{
    # mainHtmlParser - read the global file handle htmlFile one character
    # at a time and return the filtered text.
    #
    # Arguments:
    #   stopStr  - if true, a pattern matched (case-insensitively) against
    #              each complete tag; parsing stops at the first tag that
    #              matches, and that tag is NOT included in the result.
    #   stopChar - if true, a single character that ends parsing; it is
    #              included in the plain text handed to handlePlainText.
    #
    # Returns the accumulated output: plain text passed through
    # handlePlainText and tags passed through handleTag.
    #
    # Dies with "This is an invalid html file.\n" when a "<" appears
    # inside a tag, a ">" appears outside a tag, or the input (or the
    # stop character) is reached while still inside a tag.
    local($stopStr,$stopChar) = @_;
    local($char,$inTag,$tmpBuffer,$mainBuffer);
    # $mainBuffer is what is returned; $tmpBuffer collects the plain text
    # or the tag currently being read.
    $mainBuffer = "";
    $tmpBuffer = "";
    # $inTag is true while we are between "<" and ">".
    $inTag = 0;
    # Loop until the end of the file, or until we encounter the stop
    # string or the stop character.
    do
    {
        # Reading a character at a time is not the most efficient way to
        # read a file, but it keeps the state machine simple.
        $char = getc(htmlFile);
        if($char eq "<") # Start of a tag
        {
            # Dont allow any tags inside other tags
            if($inTag)
            {
                die "This is an invalid html file.\n";
            }
            $inTag = 1;
            # Flush any plain text read so far. Compare against the empty
            # string rather than testing truth, so that text consisting
            # of just "0" is not thrown away.
            if($tmpBuffer ne "")
            {
                $mainBuffer .= &handlePlainText($tmpBuffer);
            }
            # Start collecting the tag
            $tmpBuffer = "<";
        }
        elsif($char eq ">") # End of a tag
        {
            # Dont allow end tags without start tags
            if(! $inTag)
            {
                die "This is an invalid html file.\n";
            }
            $inTag = 0;
            $tmpBuffer .= ">";
            # If this tag is the stop string we are done; the stop tag
            # itself is deliberately left out of the result.
            if($stopStr && ($tmpBuffer =~ /$stopStr/i))
            {
                return $mainBuffer;
            }
            # Otherwise handle the tag and keep reading
            $mainBuffer .= &handleTag($tmpBuffer);
            $tmpBuffer = "";
        }
        elsif(eof(htmlFile)
              || ($stopChar && ($char eq $stopChar))) # End of input or stop char
        {
            # Dont allow the parsing to end inside a tag
            if($inTag)
            {
                die "This is an invalid html file.\n";
            }
            # eof() is already true right after the LAST character has
            # been read, so it cannot be used to decide whether $char
            # holds real data. Keep the character whenever getc actually
            # returned one; it is only missing when getc itself hit the
            # end of the input.
            $tmpBuffer .= $char if (defined($char) && $char ne "");
            # Handle the remaining plain text and return everything
            $mainBuffer .= &handlePlainText($tmpBuffer);
            $tmpBuffer = "";
            return $mainBuffer;
        }
        else # Ordinary character: keep collecting
        {
            $tmpBuffer .= $char;
        }
    }
    until(eof(htmlFile));
    # The loop also ends when the last character read was "<" or ">".
    # A trailing "<" leaves us inside an unterminated tag, which is
    # rejected like every other unterminated tag.
    if($inTag)
    {
        die "This is an invalid html file.\n";
    }
    # Return the main buffer
    return $mainBuffer;
}
#
# handleTag actually handles the tags for mainHtmlParser
sub handleTag
{
    # handleTag - dispatch one complete tag to its registered handler.
    #
    # Arguments:
    #   tagString - the tag text as read by mainHtmlParser.
    #
    # Returns the text that should replace the tag: the handler's return
    # value, or the result of handlePlainText when no handler is
    # registered for the tag.
    #
    # Reads the globals %endTags and %handlerDict, both keyed by the
    # "TAG" entry of the dictionary built by dictForTag.
    #
    # These variables stay "local": the handler call is built as a string
    # and eval'd, and that string refers to them by name.
    local($tagString) = @_;
    local(%tagDict,$endTag,$handler,$argString);
    local($evalString);
    # Create an associative array containing the data for the tag string.
    %tagDict = &dictForTag($tagString);
    # Look up the end tag and the handler subroutine name for this tag.
    $endTag = $endTags{$tagDict{"TAG"}};
    $handler = $handlerDict{$tagDict{"TAG"}};
    # If no handler is found, treat the tag as plain text.
    if(!($handler))
    {
        $tagString = &handlePlainText($tagString);
        return $tagString;
    }
    if($endTag eq "eol") # Tag that needs data to the end of the line
    {
        # Read up to and including the newline, then pass that to the
        # handler as its argument string.
        $argString = &mainHtmlParser("","\n");
        $evalString = "&".$handler.'($tagString,$argString,0,%tagDict);';
    }
    elsif($endTag) # Tag with an end tag
    {
        # Read everything up to the end tag. mainHtmlParser stops AT the
        # stop tag and does not include it in what it returns, so there
        # is nothing to strip here. Stripping a trailing "<...>" would
        # instead destroy legitimate nested markup at the end of the
        # argument text.
        $argString = &mainHtmlParser($endTag,0);
        $evalString = "&".$handler.'($tagString,$argString,$endTag,%tagDict);';
    }
    else # General unary tag
    {
        # For unary tags, simply call the handler.
        $evalString = "&".$handler.'($tagString,0,0,%tagDict);';
    }
    # %tagDict is flattened into the argument list, so the handler sees
    # its key/value pairs after the first three arguments.
    $tagString = eval($evalString);
    # A missing or dying handler would otherwise silently erase the tag;
    # report it but keep going.
    warn "handleTag: handler '$handler' failed: $@" if $@;
    # Return the parsed text.
    return $tagString;
}
# handlePlainText actually handles plain text for mainHtmlParser
sub handlePlainText
{
    # handlePlainText - filter a run of plain (non-tag) text.
    #
    # Arguments:
    #   plainString - the text to filter.
    #
    # Returns the value produced by the handler registered under the key
    # "DEFAULT" in the global %handlerDict, or the text unchanged when no
    # such handler is registered.
    #
    # $plainString stays "local": the handler call is built as a string
    # and eval'd, and that string refers to it by name.
    local($plainString) = @_;
    local($handler,$evalString);
    # Look for a default handler for plain text
    $handler = $handlerDict{"DEFAULT"};
    # If there is a handler, call it as
    # handler(text, 0, 0, 0) and catch the return value.
    if($handler)
    {
        $evalString = "&".$handler.'($plainString,0,0,0);';
        $plainString = eval($evalString);
        # A missing or dying handler would otherwise silently replace the
        # text with nothing; report it but keep going.
        warn "handlePlainText: handler '$handler' failed: $@" if $@;
    }
    # Return either the text passed in, or the handler's result.
    return $plainString;
}
# Creates an associative array for a tag string
sub dictForTag
{
# Declare locals
local($tagString) = @_;
local(%tagDict,$key);
# Look for the tag
# Remove it from the tag string
# Capitalize the tag, and put it into the dict
# with the key, TAG
# If no tag is found, then this is not a tag string.
if(($tagString =~ s/^<(\w*)[\s>]//) && $1)
{
($key = $1) =~ tr/a-z/A-Z/; # Make the tag upper case
$tagDict{"TAG"} = $key;
}
elsif(($tagString =~ s/^